{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0f32edca1ba2446a8f9a3e442e3b0d58",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from huggingface_hub import notebook_login\n",
    "from datasets import load_dataset\n",
    "# Show the Hugging Face login widget; the `push_to_hub` cells further down need an authenticated session.\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/adithya/miniconda3/envs/train-venv/lib/python3.10/site-packages/datasets/load.py:1429: FutureWarning: The repository for ai4bharat/samanantar contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/ai4bharat/samanantar\n",
      "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
      "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5f79cd03f38e43acb3754f9c7c2b70f2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data:   0%|          | 0.00/7.18G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7c41ea30987a4f5e9e126c1c7499facf",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# The Samanantar repo is loaded through a custom script, so `datasets` must be told explicitly that running it is OK.\n",
    "# NOTE: `trust_remote_code=True` executes code from the Hub repo; review it at https://hf.co/datasets/ai4bharat/samanantar first.\n",
    "dataset = load_dataset(\"ai4bharat/samanantar\", \"kn\", data_dir=\"./translation_datasets/\", trust_remote_code=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "44887838246540faa70cfea0760448f2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Saving the dataset (0/2 shards):   0%|          | 0/4093524 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Keep a local copy of the download so later sessions can reload it with `load_from_disk` instead of fetching it again.\n",
    "dataset.save_to_disk('translation_dataset')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_from_disk\n",
    "# Reload the copy that `save_to_disk` wrote to this directory.\n",
    "dataset = load_from_disk(\"./translation_dataset\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['idx', 'src', 'tgt'],\n",
       "    num_rows: 1000000\n",
       "})"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep the first 1,000,000 training pairs. Passing a `range` avoids building a million-element Python list first.\n",
    "small_dataset = dataset['train'].select(range(1_000_000))\n",
    "small_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['idx', 'src', 'tgt'],\n",
       "    num_rows: 400000\n",
       "})"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from datasets import load_dataset\n",
    "from sklearn.model_selection import train_test_split\n",
    "import random\n",
    "\n",
    "# Shuffle reproducibly, then keep the first 400,000 pairs as the working sample\n",
    "sampled_data = small_dataset.shuffle(seed=42).select(range(400_000))\n",
    "sampled_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "# Split the sample 50/50 with the dataset's own splitter. scikit-learn's `train_test_split`\n",
    "# is documented for lists, arrays, sparse matrices and DataFrames, not for Hugging Face datasets.\n",
    "halves = sampled_data.train_test_split(test_size=0.5, seed=42)\n",
    "dataset1, dataset2 = halves['train'], halves['test']\n",
    "\n",
    "# Function to add a random phrase to each row with a random position\n",
    "def add_random_phrase(data, source_lang, target_lang, rng=None):\n",
    "    \"\"\"Add a `text` column that turns each src/tgt pair into one prompt string.\n",
    "\n",
    "    Each row gets either no instruction (4 of the 9 equally likely options) or one\n",
    "    of five instruction phrases, placed before or after the source sentence with\n",
    "    equal probability. The result has the form `<source and phrase> : <target>`.\n",
    "\n",
    "    Args:\n",
    "        data: DataFrame with `src` and `tgt` columns; it is modified in place.\n",
    "        source_lang: Unused; kept so existing calls keep working.\n",
    "        target_lang: Language name appended to every instruction phrase.\n",
    "        rng: Optional `random.Random` used for all draws. Defaults to the\n",
    "            module-level `random` generator, so `random.seed(...)` controls the output.\n",
    "\n",
    "    Returns:\n",
    "        The same DataFrame with the new `text` column.\n",
    "    \"\"\"\n",
    "    if rng is None:\n",
    "        rng = random\n",
    "\n",
    "    instructions = [\n",
    "        \"translate the following to \" + target_lang,\n",
    "        \"convert to \" + target_lang,\n",
    "        \"translate to \" + target_lang,\n",
    "        \"translate this to \" + target_lang,\n",
    "        \"what is the translation of this in \" + target_lang,\n",
    "    ]\n",
    "    # Four empty slots beside the five instructions: a 4-in-9 chance of a bare pair\n",
    "    options = instructions + [\"\"] * 4\n",
    "\n",
    "    def build_text(src, tgt):\n",
    "        phrase = rng.choice(options)\n",
    "        if not phrase:\n",
    "            # No instruction drawn: emit the bare pair without padding spaces\n",
    "            return f\"{src} : {tgt}\"\n",
    "        if rng.choice([True, False]):\n",
    "            return f\"{src} {phrase} : {tgt}\"\n",
    "        return f\"{phrase} {src} : {tgt}\"\n",
    "\n",
    "    # Walk the two columns directly instead of paying for a row-wise `DataFrame.apply`\n",
    "    data['text'] = [build_text(src, tgt) for src, tgt in zip(data['src'], data['tgt'])]\n",
    "    return data\n",
    "\n",
    "# Fix the seed of the module-level generator so the phrase and position draws are repeatable\n",
    "random.seed(42)\n",
    "\n",
    "# English -> Kannada: the columns keep their original roles\n",
    "df_same = add_random_phrase(pd.DataFrame({'src': dataset1['src'], 'tgt': dataset1['tgt']}), 'english', 'kannada')\n",
    "\n",
    "# Kannada -> English: swap the columns so the Kannada text becomes the source\n",
    "df_inverted = add_random_phrase(pd.DataFrame({'src': dataset2['tgt'], 'tgt': dataset2['src']}), 'kannada', 'english')\n",
    "\n",
    "# Write both frames as CSV files into the `./translation_dataset/` directory\n",
    "df_same.to_csv('./translation_dataset/dataset_same_test.csv', index=False)\n",
    "df_inverted.to_csv('./translation_dataset/dataset_inverted_test.csv', index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample of the dataset with the same src and tgt:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>src</th>\n",
       "      <th>tgt</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>193232</th>\n",
       "      <td>Lemon juice 1 tsp</td>\n",
       "      <td>ಶುಂಠಿ ರಸ 1 ಟೇಬಲ್ ಚಮಚ</td>\n",
       "      <td>Lemon juice 1 tsp : ಶುಂಠಿ ರಸ 1 ಟೇಬಲ್ ಚಮಚ</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>188555</th>\n",
       "      <td>He has been admitted to a hospital in Manipal.</td>\n",
       "      <td>ಇವರನ್ನು ಉಡುಪಿಯ ಮಣಿಪಾಲ್ ಆಸ್ಪತ್ರೆಗೆ ದಾಖಲು ಮಾಡಲಾಗ...</td>\n",
       "      <td>He has been admitted to a hospital in Manipa...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5838</th>\n",
       "      <td>Actuarial science</td>\n",
       "      <td>ಅಂಟಾರ್ಕ್ಟಿಕ್ ವಿಜ್ಞಾನ</td>\n",
       "      <td>translate the following to kannada Actuarial s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>125090</th>\n",
       "      <td>\"\"\"He is a businessman.\"</td>\n",
       "      <td>`ಅವರು (ಸಚಿವರು) ವ್ಯಾಪಾರಿ.</td>\n",
       "      <td>\"\"\"He is a businessman.\" convert to kannada : ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>181508</th>\n",
       "      <td>We didnt ask anyone to resign.</td>\n",
       "      <td>ಯಾರ ರಾಜೀನಾಮೆಯನ್ನೂ ನಾವು ಕೇಳುವುದಿಲ್ಲ.</td>\n",
       "      <td>We didnt ask anyone to resign. : ಯಾರ ರಾಜೀನಾಮ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   src  \\\n",
       "193232                               Lemon juice 1 tsp   \n",
       "188555  He has been admitted to a hospital in Manipal.   \n",
       "5838                                 Actuarial science   \n",
       "125090                        \"\"\"He is a businessman.\"   \n",
       "181508                  We didnt ask anyone to resign.   \n",
       "\n",
       "                                                      tgt  \\\n",
       "193232                               ಶುಂಠಿ ರಸ 1 ಟೇಬಲ್ ಚಮಚ   \n",
       "188555  ಇವರನ್ನು ಉಡುಪಿಯ ಮಣಿಪಾಲ್ ಆಸ್ಪತ್ರೆಗೆ ದಾಖಲು ಮಾಡಲಾಗ...   \n",
       "5838                                 ಅಂಟಾರ್ಕ್ಟಿಕ್ ವಿಜ್ಞಾನ   \n",
       "125090                           `ಅವರು (ಸಚಿವರು) ವ್ಯಾಪಾರಿ.   \n",
       "181508                ಯಾರ ರಾಜೀನಾಮೆಯನ್ನೂ ನಾವು ಕೇಳುವುದಿಲ್ಲ.   \n",
       "\n",
       "                                                     text  \n",
       "193232           Lemon juice 1 tsp : ಶುಂಠಿ ರಸ 1 ಟೇಬಲ್ ಚಮಚ  \n",
       "188555    He has been admitted to a hospital in Manipa...  \n",
       "5838    translate the following to kannada Actuarial s...  \n",
       "125090  \"\"\"He is a businessman.\" convert to kannada : ...  \n",
       "181508    We didnt ask anyone to resign. : ಯಾರ ರಾಜೀನಾಮ...  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Sample of the dataset with inverted columns:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>src</th>\n",
       "      <th>tgt</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>14516</th>\n",
       "      <td>ಮಕ್ಕಳು ನಮ್ಮ ಸಮಾಜದ ಆಸ್ತಿ, ದೇಶದ ಮುಂದಿನ ಪ್ರಜೆಗಳು.</td>\n",
       "      <td>Children are future citizens of our society.</td>\n",
       "      <td>ಮಕ್ಕಳು ನಮ್ಮ ಸಮಾಜದ ಆಸ್ತಿ, ದೇಶದ ಮುಂದಿನ ಪ್ರಜೆಗಳ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>83418</th>\n",
       "      <td>\"\"\" ಇದು ಎರಡು ರೀತಿಯಲ್ಲಿ ಕೊಡಬಹುದು.\"</td>\n",
       "      <td>This could have happened in two ways.</td>\n",
       "      <td>\"\"\" ಇದು ಎರಡು ರೀತಿಯಲ್ಲಿ ಕೊಡಬಹುದು.\" translate ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43369</th>\n",
       "      <td>ಹುದ್ದೆ ಮತ್ತು ಅರ್ಹತೆ ವಿವರ</td>\n",
       "      <td>Jobs and Qualification</td>\n",
       "      <td>ಹುದ್ದೆ ಮತ್ತು ಅರ್ಹತೆ ವಿವರ   : Jobs and Qualific...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>67542</th>\n",
       "      <td>ಆದರೆ “ಯಾವಾಗಲೂ ನಗುನಗುತ್ತಾ ತುಂಬ ಸಂತೋಷವಾಗಿದ್ದ ಕುಟ...</td>\n",
       "      <td>No one ever saw a trailer like it before, reca...</td>\n",
       "      <td>ಆದರೆ “ಯಾವಾಗಲೂ ನಗುನಗುತ್ತಾ ತುಂಬ ಸಂತೋಷವಾಗಿದ್ದ ಕುಟ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>160483</th>\n",
       "      <td>ನನಗೆ ನಡೆದಾಡಲು ಸಾಧ್ಯವಾಗುತ್ತಿಲ್ಲ.</td>\n",
       "      <td>I cannot walk.</td>\n",
       "      <td>translate to english ನನಗೆ ನಡೆದಾಡಲು ಸಾಧ್ಯವಾಗುತ್...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                      src  \\\n",
       "14516      ಮಕ್ಕಳು ನಮ್ಮ ಸಮಾಜದ ಆಸ್ತಿ, ದೇಶದ ಮುಂದಿನ ಪ್ರಜೆಗಳು.   \n",
       "83418                 \"\"\" ಇದು ಎರಡು ರೀತಿಯಲ್ಲಿ ಕೊಡಬಹುದು.\"   \n",
       "43369                            ಹುದ್ದೆ ಮತ್ತು ಅರ್ಹತೆ ವಿವರ   \n",
       "67542   ಆದರೆ “ಯಾವಾಗಲೂ ನಗುನಗುತ್ತಾ ತುಂಬ ಸಂತೋಷವಾಗಿದ್ದ ಕುಟ...   \n",
       "160483                    ನನಗೆ ನಡೆದಾಡಲು ಸಾಧ್ಯವಾಗುತ್ತಿಲ್ಲ.   \n",
       "\n",
       "                                                      tgt  \\\n",
       "14516        Children are future citizens of our society.   \n",
       "83418               This could have happened in two ways.   \n",
       "43369                              Jobs and Qualification   \n",
       "67542   No one ever saw a trailer like it before, reca...   \n",
       "160483                                     I cannot walk.   \n",
       "\n",
       "                                                     text  \n",
       "14516     ಮಕ್ಕಳು ನಮ್ಮ ಸಮಾಜದ ಆಸ್ತಿ, ದೇಶದ ಮುಂದಿನ ಪ್ರಜೆಗಳ...  \n",
       "83418   \"\"\" ಇದು ಎರಡು ರೀತಿಯಲ್ಲಿ ಕೊಡಬಹುದು.\" translate ...  \n",
       "43369   ಹುದ್ದೆ ಮತ್ತು ಅರ್ಹತೆ ವಿವರ   : Jobs and Qualific...  \n",
       "67542   ಆದರೆ “ಯಾವಾಗಲೂ ನಗುನಗುತ್ತಾ ತುಂಬ ಸಂತೋಷವಾಗಿದ್ದ ಕುಟ...  \n",
       "160483  translate to english ನನಗೆ ನಡೆದಾಡಲು ಸಾಧ್ಯವಾಗುತ್...  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Show five random rows from each direction under its own heading\n",
    "previews = (\n",
    "    (\"Sample of the dataset with the same src and tgt:\", df_same),\n",
    "    (\"\\nSample of the dataset with inverted columns:\", df_inverted),\n",
    ")\n",
    "for heading, frame in previews:\n",
    "    print(heading)\n",
    "    display(frame.sample(5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset({\n",
      "    features: ['src', 'tgt', 'text'],\n",
      "    num_rows: 200000\n",
      "})\n",
      "Dataset({\n",
      "    features: ['src', 'tgt', 'text'],\n",
      "    num_rows: 200000\n",
      "})\n"
     ]
    }
   ],
   "source": [
    "from datasets import Dataset\n",
    "\n",
    "# Wrap each DataFrame as a Hugging Face `Dataset`\n",
    "dataset_same, dataset_inverted = (Dataset.from_pandas(frame) for frame in (df_same, df_inverted))\n",
    "\n",
    "# # Save datasets to disk\n",
    "# dataset_same.save_to_disk('./translation_datasets/dataset_same')\n",
    "# dataset_inverted.save_to_disk('./translation_datasets/dataset_inverted')\n",
    "\n",
    "# Print both summaries, one after the other\n",
    "print(dataset_same, dataset_inverted, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b928af95b59242888d9a963cb89b2f79",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "95ae866f66d141b694f8dc7a059a0735",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/200 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9f446c28ec894d13bbd9eb51f123ad12",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "238d84e52b604a13a01d5aeec2ca4434",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/200 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/CognitiveLab/Kannada_to_English/commit/dd23e0cda3bc694ce0202e36d355c9f44123a48b', commit_message='Upload dataset', commit_description='', oid='dd23e0cda3bc694ce0202e36d355c9f44123a48b', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Publish each direction to its own private repository on the Hugging Face Hub (requires authentication).\n",
    "# The repository ids below are specific to this project; change them to push under a different account.\n",
    "dataset_same.push_to_hub('CognitiveLab/English_to_Kannada' ,private=True)\n",
    "dataset_inverted.push_to_hub('CognitiveLab/Kannada_to_English',private=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>src</th>\n",
       "      <th>tgt</th>\n",
       "      <th>text</th>\n",
       "      <th>translation_direction</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Abdul Jaleel K., the district president of the...</td>\n",
       "      <td>ಕಾರ್ಯಕ್ರಮದ ಅಧ್ಯಕ್ಷತೆಯನ್ನು ಎಸ್ ಡಿ ಪಿ ಐ ಸುಳ್ಯ ಕ್...</td>\n",
       "      <td>Abdul Jaleel K., the district president of the...</td>\n",
       "      <td>en_to_kn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>\"Write this down.\"\"\"</td>\n",
       "      <td>ಅದನ್ನು ಬರೆದುಕೊಳ್ಳಿ' ಎಂದು ನಿಜಕ್ಕೂ ಸತ್ತುಹೋದ.</td>\n",
       "      <td>\"Write this down.\"\"\" translate to kannada : ಅದ...</td>\n",
       "      <td>en_to_kn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>RTI office</td>\n",
       "      <td>ಆರ್ ಟಿಓ ಕಚೇರಿ</td>\n",
       "      <td>RTI office : ಆರ್ ಟಿಓ ಕಚೇರಿ</td>\n",
       "      <td>en_to_kn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Sa in Sanskrit is to have, and Misha in Russia...</td>\n",
       "      <td>ಸ ಎಂದರೆ ಸಂಸ್ಕೃತದಲ್ಲಿ ಹೊಂದುವುದು ಎಂದರ್ಥ, ಹಾಗೆಯೇ ...</td>\n",
       "      <td>translate this to kannada Sa in Sanskrit is to...</td>\n",
       "      <td>en_to_kn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Thousands of trucks carrying essential commodi...</td>\n",
       "      <td>ಕರ್ನಾಟಕ ಮತ್ತು ಮಹಾರಾಷ್ಟ್ರಕ್ಕೆ ಸಂಪರ್ಕ ಕಲ್ಪಿಸುವ ಈ...</td>\n",
       "      <td>translate the following to kannada Thousands o...</td>\n",
       "      <td>en_to_kn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>199995</th>\n",
       "      <td>The mind is fickle.</td>\n",
       "      <td>ಮನಸ್ಸು ಚುರುಕ್ ಅಂತು.</td>\n",
       "      <td>translate the following to kannada The mind is...</td>\n",
       "      <td>en_to_kn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>199996</th>\n",
       "      <td>\"\"\"Hey pops.\"</td>\n",
       "      <td>\"\"\"ಮೇರಿ ಪಾಪಿನ್ಸ್.\"</td>\n",
       "      <td>convert to kannada \"\"\"Hey pops.\" : \"\"\"ಮೇರಿ ಪಾ...</td>\n",
       "      <td>en_to_kn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>199997</th>\n",
       "      <td>He was earlier associated with Aam Aadmi Party.</td>\n",
       "      <td>ಅವರು ಈ ಹಿಂದೆ ಅಕ್ ಪಾರ್ಟಿಯ ಮೈತ್ರಿಯಲ್ಲಿದ್ದರು.</td>\n",
       "      <td>He was earlier associated with Aam Aadmi Party...</td>\n",
       "      <td>en_to_kn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>199998</th>\n",
       "      <td>The film was directed by Raj.</td>\n",
       "      <td>ಚಿತ್ರವನ್ನು ರಾಜ್ ಬಿಎನ್ ನಿರ್ದೇಶಿಸಿದ್ದಾರೆ.</td>\n",
       "      <td>The film was directed by Raj. translate the fo...</td>\n",
       "      <td>en_to_kn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>199999</th>\n",
       "      <td>As India looks to win the remaining two Tests ...</td>\n",
       "      <td>ಮುಂದಿನ ಎರಡು ಟೆಸ್ಟ್‌ ಪಂದ್ಯಗಳ ಪೈಕಿ ಕನಿಷ್ಠ ಒಂದು ಪ...</td>\n",
       "      <td>As India looks to win the remaining two Test...</td>\n",
       "      <td>en_to_kn</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>200000 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                      src  \\\n",
       "0       Abdul Jaleel K., the district president of the...   \n",
       "1                                    \"Write this down.\"\"\"   \n",
       "2                                              RTI office   \n",
       "3       Sa in Sanskrit is to have, and Misha in Russia...   \n",
       "4       Thousands of trucks carrying essential commodi...   \n",
       "...                                                   ...   \n",
       "199995                                The mind is fickle.   \n",
       "199996                                      \"\"\"Hey pops.\"   \n",
       "199997    He was earlier associated with Aam Aadmi Party.   \n",
       "199998                      The film was directed by Raj.   \n",
       "199999  As India looks to win the remaining two Tests ...   \n",
       "\n",
       "                                                      tgt  \\\n",
       "0       ಕಾರ್ಯಕ್ರಮದ ಅಧ್ಯಕ್ಷತೆಯನ್ನು ಎಸ್ ಡಿ ಪಿ ಐ ಸುಳ್ಯ ಕ್...   \n",
       "1              ಅದನ್ನು ಬರೆದುಕೊಳ್ಳಿ' ಎಂದು ನಿಜಕ್ಕೂ ಸತ್ತುಹೋದ.   \n",
       "2                                           ಆರ್ ಟಿಓ ಕಚೇರಿ   \n",
       "3       ಸ ಎಂದರೆ ಸಂಸ್ಕೃತದಲ್ಲಿ ಹೊಂದುವುದು ಎಂದರ್ಥ, ಹಾಗೆಯೇ ...   \n",
       "4       ಕರ್ನಾಟಕ ಮತ್ತು ಮಹಾರಾಷ್ಟ್ರಕ್ಕೆ ಸಂಪರ್ಕ ಕಲ್ಪಿಸುವ ಈ...   \n",
       "...                                                   ...   \n",
       "199995                                ಮನಸ್ಸು ಚುರುಕ್ ಅಂತು.   \n",
       "199996                                \"\"\"ಮೇರಿ ಪಾಪಿನ್ಸ್.\"   \n",
       "199997         ಅವರು ಈ ಹಿಂದೆ ಅಕ್ ಪಾರ್ಟಿಯ ಮೈತ್ರಿಯಲ್ಲಿದ್ದರು.   \n",
       "199998            ಚಿತ್ರವನ್ನು ರಾಜ್ ಬಿಎನ್ ನಿರ್ದೇಶಿಸಿದ್ದಾರೆ.   \n",
       "199999  ಮುಂದಿನ ಎರಡು ಟೆಸ್ಟ್‌ ಪಂದ್ಯಗಳ ಪೈಕಿ ಕನಿಷ್ಠ ಒಂದು ಪ...   \n",
       "\n",
       "                                                     text  \\\n",
       "0       Abdul Jaleel K., the district president of the...   \n",
       "1       \"Write this down.\"\"\" translate to kannada : ಅದ...   \n",
       "2                              RTI office : ಆರ್ ಟಿಓ ಕಚೇರಿ   \n",
       "3       translate this to kannada Sa in Sanskrit is to...   \n",
       "4       translate the following to kannada Thousands o...   \n",
       "...                                                   ...   \n",
       "199995  translate the following to kannada The mind is...   \n",
       "199996  convert to kannada \"\"\"Hey pops.\" : \"\"\"ಮೇರಿ ಪಾ...   \n",
       "199997  He was earlier associated with Aam Aadmi Party...   \n",
       "199998  The film was directed by Raj. translate the fo...   \n",
       "199999    As India looks to win the remaining two Test...   \n",
       "\n",
       "       translation_direction  \n",
       "0                   en_to_kn  \n",
       "1                   en_to_kn  \n",
       "2                   en_to_kn  \n",
       "3                   en_to_kn  \n",
       "4                   en_to_kn  \n",
       "...                      ...  \n",
       "199995              en_to_kn  \n",
       "199996              en_to_kn  \n",
       "199997              en_to_kn  \n",
       "199998              en_to_kn  \n",
       "199999              en_to_kn  \n",
       "\n",
       "[200000 rows x 4 columns]"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Optional: reload the CSV files instead of rebuilding the frames (paths match the earlier `to_csv` calls)\n",
    "# df_same = pd.read_csv('./translation_dataset/dataset_same_test.csv')\n",
    "# df_inverted = pd.read_csv('./translation_dataset/dataset_inverted_test.csv')\n",
    "\n",
    "# Tag every row with its direction: \"en_to_kn\" or \"kn_to_en\"\n",
    "df_same['translation_direction'] = 'en_to_kn'\n",
    "df_inverted['translation_direction'] = 'kn_to_en'\n",
    "\n",
    "# Preview the tagged English -> Kannada frame; the two frames are combined in the next cell\n",
    "df_same"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['src', 'tgt', 'text', 'translation_direction'],\n",
       "    num_rows: 400000\n",
       "})"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stack both directions, shuffle the rows reproducibly, and renumber the index\n",
    "combined_df = (\n",
    "    pd.concat([df_same, df_inverted], ignore_index=True)\n",
    "    .sample(frac=1, random_state=42)\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "# Wrap the shuffled frame as a Hugging Face dataset; the bare name below shows its summary\n",
    "combined_dataset = Dataset.from_pandas(combined_df)\n",
    "combined_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "aba7694939164473a766705dd8072e2b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Saving the dataset (0/1 shards):   0%|          | 0/400000 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1c79be9d64364576b258e48304ba46f1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "caa44b25e3dc479ba599e61605b85839",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/400 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/CognitiveLab/translation-combined-dataset/commit/f14c910e390827fc217c783c9cee01e96f21adc7', commit_message='Upload dataset', commit_description='', oid='f14c910e390827fc217c783c9cee01e96f21adc7', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Save the combined dataset to disk\n",
    "combined_dataset.save_to_disk('./translation_datasets/combined_dataset')\n",
    "\n",
    "# Upload the combined dataset to the Hugging Face Hub (requires authentication).\n",
    "# NOTE(review): unlike the two per-direction pushes, this call does not pass `private=True` - confirm that is intended.\n",
    "combined_dataset.push_to_hub('CognitiveLab/translation-combined-dataset')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "train-venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
